/***
This do-file creates a scatterplot showing the industry share of each 2-digit NAICS 
code of job postings in the Job Openings and Labor Turnover Survey (JOLTS) data 
in January 2020 vs. the corresponding industry share in job postings in Burning 
Glass data in January 2020. 
***/

*-------------------------------------------------------------------------------
* Set up
*-------------------------------------------------------------------------------

* Set $root 
project figstabs, root
if (r(buildrunning)==0) include "${root}/code/config_interactive.do"

* Set globals
project, uses("${root}/code/set_globals.do")
include "${root}/code/set_globals.do"
local category "Jobs"

* Create required subfolders
cap mkdir "${root}/results/`category'"
cap mkdir "${root}/results/paper numbers"
cap mkdir "${root}/results/paper numbers/`category'"

* Erase output numbers
cap erase "${root}/results/paper numbers/`category'/Industry Shares of Job Postings in Burning Glass and Job Openings in JOLTS.yaml"

*-------------------------------------------------------------------------------
* Load and process data
*-------------------------------------------------------------------------------

project, uses("${root}/data/dvc/JOLTS/Job Openings.txt")
import delimited "${root}/data/dvc/JOLTS/Job Openings.txt", clear
drop footnote_codes
replace series_id = strtrim(series_id)

* Drop data series that we don't need
drop if substr(series_id, strlen(series_id), 1) == "R" // drop rate changes
drop if substr(series_id, 10, 2) != "00" // keep only the national series

* Create new vars using codebook.
gen seasonally_adjusted = substr(series_id, 3, 1) == "S"
drop if seasonally_adjusted == 1
gen naics = real(substr(series_id, 4, 6))

* Format date variable
gen date = ym(year, real(substr(period, 2, 2)))
drop year period series_id
format date %tm
keep if inrange(date, ym(2018, 1), ym(2020, 4))

replace value = value * 1000

* Only keep codes that match up with NAICS
drop if inlist(naics, 0, 100000, 110099, 320000, 340000, 400000, 510099, 600000, 700000, 910000, 920000, 923000, 929000)
replace naics = 310000 if naics == 300000
replace naics = 480000 if naics == 480099
replace naics = 920000 if naics == 900000
rename naics naics_temp
gen int naics = naics_temp / 10000
drop naics_temp

* Label naics codes
#delimit ;
label define naics
		23 "Construction"
		31 "Manufacturing"
		42 "Wholesale Trade"
		44 "Retail Trade"
		48 "Transportation & Warehousing (Util?)"
		51 "Information"
		52 "Finance and Insurance"
		53 "Real estate and leasing"
		54 "Professional Services"
		61 "Educational Services"
		62 "Health Care and Social Asst."
		71 "Arts, Entmnt, and Rec."
		72 "Accomodation and Food Services"
		81 "Other services"
		92 "Public admin.";
#delimit cr
label values naics naics

* Map these codes to the codes we're using for BG
gen bg_code = .
replace bg_code = 11 if naics == 31
replace bg_code = 5 if naics == 23
replace bg_code = 20 if naics == 42
replace bg_code = 17 if naics == 44
replace bg_code = 18 if naics == 48
replace bg_code = 9 if naics == 51
replace bg_code = 7 if naics == 52
replace bg_code = 16 if naics == 53
replace bg_code = 14 if naics == 54
replace bg_code = 6 if naics == 61
replace bg_code = 8 if naics == 62
replace bg_code = 4 if naics == 71
replace bg_code = 1 if naics == 72
replace bg_code = 13 if naics == 81
replace bg_code = 15 if naics == 92

rename value jolt_job_openings
gcollapse (sum) jolt_job_openings, by(bg_code naics date)
rename naics industry
tostring industry, replace force

tempfile jolts
save `jolts'

*-------------------------------------------------------------------------------
* Check that we get 98.5%+ in JOLTS industries
*-------------------------------------------------------------------------------
project, uses("${root}/data/web/data/Job Postings Industry Shares - National - 2020.csv")
import delimited "${root}/data/web/data/Job Postings Industry Shares - National - 2020.csv", clear
replace industry = "54" if industry == "56"

* Get national series
gcollapse (sum) share_jan2020, by(industry)

* Create date var for merge
gen date = ym(2020, 1)
format date %tm

merge 1:1 date industry using `jolts', keep(2 3)

replace industry = "9999" if _merge==3
gcollapse (sum) share_jan2020, by(industry)
gegen tot = total(share_jan2020)
gen share = share_jan2020 / tot

assert share > 0.985 & !mi(share) if industry == "9999"

*-------------------------------------------------------------------------------
* Make comparison plot
*-------------------------------------------------------------------------------
project, uses("${root}/data/web/data/Job Postings Industry Shares - National - 2020.csv")
import delimited "${root}/data/web/data/Job Postings Industry Shares - National - 2020.csv", clear

replace industry = "54" if industry == "56"
replace industry = "31" if industry == "31-33"
replace industry = "44" if industry == "44-45"
replace industry = "48" if industry == "48-49"

tab industry

gen ss = .
replace ss = 10 if inlist(industry, "11", "21")
replace ss = 20 if inlist(industry, "23")
replace ss = 30 if inlist(industry, "31", "32", "33")
replace ss = 40 if inlist(industry, "42", "44", "45", "48", "49", "22")
replace ss = 50 if inlist(industry, "51")
replace ss = 55 if inlist(industry, "52", "53")
replace ss = 60 if inlist(industry, "54", "55", "56")
replace ss = 65 if inlist(industry, "61", "62")
replace ss = 70 if inlist(industry, "71", "72")
replace ss = 80 if inlist(industry, "81")
replace ss = 90 if inlist(industry, "91", "92", "93")
assert ~mi(ss)

gcollapse (sum) share_jan2020, by(industry ss)

gen date = ym(2020, 1)
format date %tm

merge 1:1 date industry using `jolts', keep(3)

gcollapse (sum) jolt_job_openings share_jan2020, by(industry)

gegen total_jolts = total(jolt_job_openings)
gegen total_bg = total(share_jan2020)

gen share_jolts = 100 * jolt_job_openings/total_jolts
gen share_bg = 100 * share_jan2020/total_bg

* Compute correlation 
corr share_jolts share_bg [w = jolt_job_openings]
corr share_jolts share_bg [w = jolt_job_openings] if industry != "92" 
local corr: di %4.2f `r(rho)'

* Scatter 
tw ///
	(function y = x, ra(0 25) color(oi2)) /// 
	(scatter share_jolts share_bg, mlab(industry) msymbol(i) mlabcolor(oi1)) /// 
	, /// 
	xtitle("Industry Share of Lightcast" "Job Postings in January 2020 (%)") /// 
	ytitle("Industry Share of JOLTS" "Job Openings in January 2020 (%)") /// 
	ylab(0 "0%" 5 "5%" 10 "10%" 15 "15%" 20 "20%" 25 "25%", nogrid) /// 
	xlab(0 "0%" 5 "5%" 10 "10%" 15 "15%" 20 "20%" 25 "25%", nogrid) /// 
	legend(off) /// 
	text(0 20 "Correlation: `corr'", color(gs7)) /// 
	aspect(1) 
oi_graph_export "${root}/results/Jobs/Job Openings BG vs JOLTS", type(${fig_type})

*-------------------------------------------------------------------------------
* Export output numbers to csv file
*-------------------------------------------------------------------------------

yamlout using "${root}/results/paper numbers/`category'/Industry Shares of Job Postings in Burning Glass and Job Openings in JOLTS.yaml", ///
	key("bg_jolts_corr") ///
	comment("Correlation BG vs JOLTS") ///
	value(`corr') fmt(%9.2f)

project, creates("${root}/results/paper numbers/`category'/Industry Shares of Job Postings in Burning Glass and Job Openings in JOLTS.yaml")
